{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPU Extreme gradient boosting trained on timestamp text data-set\n",
    "\n",
    "1. Same emotion dataset from [NLP-dataset](https://github.com/huseinzol05/NLP-Dataset)\n",
    "2. Same split: 80% training, 20% testing; results may vary depending on randomness\n",
    "3. Same regex substitution '[^\\\"\\'A-Za-z0-9 ]+'\n",
    "\n",
    "## Example\n",
    "\n",
    "Each word is encoded by its position in the sorted dictionary:\n",
    "\n",
    "text: 'module into which all the refactored classes', matrix: [167, 143, 12, 3, 4, 90]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import pickle\n",
    "import re\n",
    "import time\n",
    "\n",
    "import numpy as np\n",
    "import sklearn.datasets\n",
    "import xgboost as xgb\n",
    "\n",
    "try:\n",
    "    # train_test_split lives in model_selection since sklearn 0.18\n",
    "    from sklearn.model_selection import train_test_split\n",
    "except ImportError:\n",
    "    # fallback for old sklearn releases that only ship the deprecated module\n",
    "    from sklearn.cross_validation import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clearstring(string):\n",
    "    \"\"\"Strip disallowed characters and normalise spacing in one line of text.\n",
    "\n",
    "    Every character other than quotes, ASCII letters, digits and spaces is\n",
    "    removed; the remaining words are re-joined with single spaces.\n",
    "    \"\"\"\n",
    "    kept = re.sub('[^\\\"\\'A-Za-z0-9 ]+', '', string)\n",
    "    # only plain spaces survive the substitution, so a whitespace split\n",
    "    # discards the empty pieces between repeated spaces\n",
    "    return ' '.join(kept.split())\n",
    "\n",
    "# sklearn.datasets.load_files reads each document as a single element,\n",
    "# so we split every document on newlines\n",
    "def separate_dataset(trainset):\n",
    "    \"\"\"Flatten a loaded dataset into per-line samples with matching labels.\n",
    "\n",
    "    Each entry of ``trainset.data`` is split on newlines, empty lines are\n",
    "    dropped and every remaining line is cleaned with ``clearstring``. The\n",
    "    label ``trainset.target[i]`` is repeated once per line of document ``i``.\n",
    "\n",
    "    Returns a ``(lines, labels)`` pair of equally long lists.\n",
    "    \"\"\"\n",
    "    datastring, datatarget = [], []\n",
    "    for idx, document in enumerate(trainset.data):\n",
    "        # empty lines are discarded before cleaning, so a line that cleans\n",
    "        # down to '' is still kept as a sample\n",
    "        lines = [clearstring(line) for line in document.split('\\n') if line]\n",
    "        datastring.extend(lines)\n",
    "        datatarget.extend(trainset.target[idx] for _ in lines)\n",
    "    return datastring, datatarget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# read every file under the 'data' directory as UTF-8 text\n",
    "trainset_data = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "# NOTE: overwrites .data / .target in place with the per-line samples and\n",
    "# labels returned by separate_dataset\n",
    "trainset_data.data, trainset_data.target = separate_dataset(trainset_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# dict_emotion is looked up by token when the feature matrix is filled\n",
    "# NOTE(security): pickle.load can execute arbitrary code; only load a\n",
    "# 'dictionary_emotion.p' file that comes from a trusted source\n",
    "with open('dictionary_emotion.p', 'rb') as fopen:\n",
    "    dict_emotion = pickle.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# number of token slots per sample; texts are cut to their first maxlen tokens\n",
    "maxlen = 50\n",
    "# one zero-initialised row of maxlen slots per sample\n",
    "data_X = np.zeros((len(trainset_data.data), maxlen))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Encode every sample as a right-aligned row of dictionary values: the last\n",
    "# token lands in the last column and unused leading slots keep their 0.\n",
    "for i in range(data_X.shape[0]):\n",
    "    tokens = trainset_data.data[i].split()[:maxlen]\n",
    "    for no, text in enumerate(tokens[::-1]):\n",
    "        try:\n",
    "            data_X[i, -1 - no] = dict_emotion[text]\n",
    "        # presumably dict_emotion is a plain dict, so an unknown token raises\n",
    "        # KeyError - confirm; any other error is no longer silently swallowed\n",
    "        except KeyError:\n",
    "            # token missing from the dictionary: leave the slot at 0\n",
    "            continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# fixed seed so the 80/20 split is identical on every run\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(data_X, trainset_data.target, test_size = 0.2, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Booster parameters handed to xgb.train / xgb.Booster and later dumped as JSON.\n",
    "params_xgd = {\n",
    "    'min_child_weight': 10.0,\n",
    "    'objective': 'multi:softprob',\n",
    "    'eval_metric': 'mlogloss',\n",
    "    # one class per entry of target_names\n",
    "    'num_class': len(trainset_data.target_names),\n",
    "    'max_depth': 7,\n",
    "    'max_delta_step': 1.8,\n",
    "    'colsample_bytree': 0.4,\n",
    "    'subsample': 0.8,\n",
    "    'eta': 0.03,\n",
    "    'gamma': 0.65,\n",
    "    # NOTE(review): the round count is passed to xgb.train directly (100000), and the\n",
    "    # recorded run went past 8000 rounds, so this entry does not appear to limit\n",
    "    # training - confirm before relying on it\n",
    "    'num_boost_round' : 700,\n",
    "    'gpu_id': 0,\n",
    "    'tree_method': 'gpu_hist'\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-mlogloss:1.78348\tvalid-mlogloss:1.78357\n",
      "Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.\n",
      "\n",
      "Will train until valid-mlogloss hasn't improved in 200 rounds.\n",
      "[100]\ttrain-mlogloss:1.53407\tvalid-mlogloss:1.545\n",
      "[200]\ttrain-mlogloss:1.49369\tvalid-mlogloss:1.51609\n",
      "[300]\ttrain-mlogloss:1.46646\tvalid-mlogloss:1.50022\n",
      "[400]\ttrain-mlogloss:1.44417\tvalid-mlogloss:1.48872\n",
      "[500]\ttrain-mlogloss:1.42395\tvalid-mlogloss:1.47904\n",
      "[600]\ttrain-mlogloss:1.40675\tvalid-mlogloss:1.47175\n",
      "[700]\ttrain-mlogloss:1.39027\tvalid-mlogloss:1.46504\n",
      "[800]\ttrain-mlogloss:1.37454\tvalid-mlogloss:1.45882\n",
      "[900]\ttrain-mlogloss:1.36001\tvalid-mlogloss:1.45368\n",
      "[1000]\ttrain-mlogloss:1.34602\tvalid-mlogloss:1.44885\n",
      "[1100]\ttrain-mlogloss:1.33241\tvalid-mlogloss:1.4442\n",
      "[1200]\ttrain-mlogloss:1.31944\tvalid-mlogloss:1.44019\n",
      "[1300]\ttrain-mlogloss:1.30672\tvalid-mlogloss:1.4362\n",
      "[1400]\ttrain-mlogloss:1.29459\tvalid-mlogloss:1.43276\n",
      "[1500]\ttrain-mlogloss:1.2826\tvalid-mlogloss:1.42936\n",
      "[1600]\ttrain-mlogloss:1.27102\tvalid-mlogloss:1.42606\n",
      "[1700]\ttrain-mlogloss:1.25938\tvalid-mlogloss:1.42285\n",
      "[1800]\ttrain-mlogloss:1.24856\tvalid-mlogloss:1.42013\n",
      "[1900]\ttrain-mlogloss:1.23785\tvalid-mlogloss:1.41748\n",
      "[2000]\ttrain-mlogloss:1.22709\tvalid-mlogloss:1.41484\n",
      "[2100]\ttrain-mlogloss:1.21666\tvalid-mlogloss:1.41225\n",
      "[2200]\ttrain-mlogloss:1.20661\tvalid-mlogloss:1.4101\n",
      "[2300]\ttrain-mlogloss:1.19652\tvalid-mlogloss:1.40779\n",
      "[2400]\ttrain-mlogloss:1.18672\tvalid-mlogloss:1.40555\n",
      "[2500]\ttrain-mlogloss:1.17705\tvalid-mlogloss:1.40361\n",
      "[2600]\ttrain-mlogloss:1.16751\tvalid-mlogloss:1.40157\n",
      "[2700]\ttrain-mlogloss:1.15809\tvalid-mlogloss:1.39965\n",
      "[2800]\ttrain-mlogloss:1.14897\tvalid-mlogloss:1.39786\n",
      "[2900]\ttrain-mlogloss:1.13997\tvalid-mlogloss:1.39606\n",
      "[3000]\ttrain-mlogloss:1.13123\tvalid-mlogloss:1.39453\n",
      "[3100]\ttrain-mlogloss:1.12257\tvalid-mlogloss:1.39301\n",
      "[3200]\ttrain-mlogloss:1.11378\tvalid-mlogloss:1.39146\n",
      "[3300]\ttrain-mlogloss:1.10517\tvalid-mlogloss:1.38988\n",
      "[3400]\ttrain-mlogloss:1.09703\tvalid-mlogloss:1.38847\n",
      "[3500]\ttrain-mlogloss:1.08871\tvalid-mlogloss:1.38708\n",
      "[3600]\ttrain-mlogloss:1.08079\tvalid-mlogloss:1.38579\n",
      "[3700]\ttrain-mlogloss:1.07295\tvalid-mlogloss:1.38458\n",
      "[3800]\ttrain-mlogloss:1.0653\tvalid-mlogloss:1.38338\n",
      "[3900]\ttrain-mlogloss:1.05769\tvalid-mlogloss:1.38235\n",
      "[4000]\ttrain-mlogloss:1.04998\tvalid-mlogloss:1.38113\n",
      "[4100]\ttrain-mlogloss:1.04247\tvalid-mlogloss:1.3799\n",
      "[4200]\ttrain-mlogloss:1.03512\tvalid-mlogloss:1.37898\n",
      "[4300]\ttrain-mlogloss:1.02772\tvalid-mlogloss:1.37806\n",
      "[4400]\ttrain-mlogloss:1.02053\tvalid-mlogloss:1.37717\n",
      "[4500]\ttrain-mlogloss:1.0135\tvalid-mlogloss:1.37629\n",
      "[4600]\ttrain-mlogloss:1.00657\tvalid-mlogloss:1.37536\n",
      "[4700]\ttrain-mlogloss:0.999714\tvalid-mlogloss:1.3745\n",
      "[4800]\ttrain-mlogloss:0.992989\tvalid-mlogloss:1.37375\n",
      "[4900]\ttrain-mlogloss:0.986378\tvalid-mlogloss:1.37305\n",
      "[5000]\ttrain-mlogloss:0.979788\tvalid-mlogloss:1.3724\n",
      "[5100]\ttrain-mlogloss:0.973293\tvalid-mlogloss:1.37173\n",
      "[5200]\ttrain-mlogloss:0.966882\tvalid-mlogloss:1.37117\n",
      "[5300]\ttrain-mlogloss:0.960498\tvalid-mlogloss:1.37061\n",
      "[5400]\ttrain-mlogloss:0.954371\tvalid-mlogloss:1.37007\n",
      "[5500]\ttrain-mlogloss:0.948171\tvalid-mlogloss:1.36956\n",
      "[5600]\ttrain-mlogloss:0.942044\tvalid-mlogloss:1.36901\n",
      "[5700]\ttrain-mlogloss:0.935881\tvalid-mlogloss:1.36854\n",
      "[5800]\ttrain-mlogloss:0.92993\tvalid-mlogloss:1.36813\n",
      "[5900]\ttrain-mlogloss:0.923838\tvalid-mlogloss:1.36766\n",
      "[6000]\ttrain-mlogloss:0.918089\tvalid-mlogloss:1.36727\n",
      "[6100]\ttrain-mlogloss:0.912351\tvalid-mlogloss:1.3669\n",
      "[6200]\ttrain-mlogloss:0.906669\tvalid-mlogloss:1.36654\n",
      "[6300]\ttrain-mlogloss:0.901104\tvalid-mlogloss:1.36622\n",
      "[6400]\ttrain-mlogloss:0.895511\tvalid-mlogloss:1.36591\n",
      "[6500]\ttrain-mlogloss:0.890092\tvalid-mlogloss:1.36555\n",
      "[6600]\ttrain-mlogloss:0.884573\tvalid-mlogloss:1.36527\n",
      "[6700]\ttrain-mlogloss:0.879064\tvalid-mlogloss:1.36493\n",
      "[6800]\ttrain-mlogloss:0.873813\tvalid-mlogloss:1.36473\n",
      "[6900]\ttrain-mlogloss:0.868591\tvalid-mlogloss:1.36454\n",
      "[7000]\ttrain-mlogloss:0.863292\tvalid-mlogloss:1.36426\n",
      "[7100]\ttrain-mlogloss:0.858082\tvalid-mlogloss:1.36403\n",
      "[7200]\ttrain-mlogloss:0.852984\tvalid-mlogloss:1.36376\n",
      "[7300]\ttrain-mlogloss:0.847897\tvalid-mlogloss:1.36359\n",
      "[7400]\ttrain-mlogloss:0.842881\tvalid-mlogloss:1.36346\n",
      "[7500]\ttrain-mlogloss:0.837853\tvalid-mlogloss:1.36332\n",
      "[7600]\ttrain-mlogloss:0.832971\tvalid-mlogloss:1.36322\n",
      "[7700]\ttrain-mlogloss:0.828155\tvalid-mlogloss:1.36315\n",
      "[7800]\ttrain-mlogloss:0.82335\tvalid-mlogloss:1.36308\n",
      "[7900]\ttrain-mlogloss:0.818641\tvalid-mlogloss:1.36302\n",
      "[8000]\ttrain-mlogloss:0.814018\tvalid-mlogloss:1.36294\n",
      "[8100]\ttrain-mlogloss:0.809366\tvalid-mlogloss:1.36289\n",
      "[8200]\ttrain-mlogloss:0.804811\tvalid-mlogloss:1.36291\n",
      "[8300]\ttrain-mlogloss:0.800271\tvalid-mlogloss:1.36283\n",
      "[8400]\ttrain-mlogloss:0.795808\tvalid-mlogloss:1.36294\n",
      "[8500]\ttrain-mlogloss:0.791467\tvalid-mlogloss:1.36301\n",
      "Stopping. Best iteration:\n",
      "[8307]\ttrain-mlogloss:0.799969\tvalid-mlogloss:1.36283\n",
      "\n",
      "1651.741 Seconds to train xgb\n"
     ]
    }
   ],
   "source": [
    "# Train on the training split while monitoring both splits; the entry listed\n",
    "# last in the watchlist ('valid') drives early stopping.\n",
    "d_train = xgb.DMatrix(train_X, label=train_Y)\n",
    "d_valid = xgb.DMatrix(test_X, label=test_Y)\n",
    "watchlist = [(d_train, 'train'), (d_valid, 'valid')]\n",
    "# (disabled) a previously pickled booster could be restored from 'clf.p' instead of retraining\n",
    "t = time.time()\n",
    "clf = xgb.train(\n",
    "    params_xgd,\n",
    "    d_train,\n",
    "    num_boost_round=100000,\n",
    "    evals=watchlist,\n",
    "    early_stopping_rounds=200,\n",
    "    maximize=False,\n",
    "    verbose_eval=100\n",
    ")\n",
    "print(round(time.time() - t, 3), 'Seconds to train xgb')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.46755116240013433"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# test accuracy, predicting with the trees up to clf.best_ntree_limit only\n",
    "# NOTE(review): test_X / test_Y also served as the early-stopping validation set,\n",
    "# so this score is not measured on fully unseen data\n",
    "np.mean(test_Y == np.argmax(clf.predict(xgb.DMatrix(test_X), ntree_limit=clf.best_ntree_limit), axis = 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# write the trained booster to disk so it can be reloaded without retraining\n",
    "clf.save_model('xgb-timestamp50.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# rebuild a booster from the saved model file\n",
    "bst = xgb.Booster(params = params_xgd)\n",
    "bst.load_model('xgb-timestamp50.model')\n",
    "# store the parameter dictionary as JSON alongside the model\n",
    "with open('xgb-timestamp-param', 'w') as fopen:\n",
    "    json.dump(params_xgd, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.46752717065329524"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# accuracy of the reloaded booster on the same test split\n",
    "# NOTE(review): unlike the evaluation of clf, no ntree_limit is passed here; the recorded\n",
    "# score differs slightly (0.46753 vs 0.46755), presumably because every tree is used - confirm\n",
    "np.mean(test_Y == np.argmax(bst.predict(xgb.DMatrix(test_X)), axis = 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.48      0.21      0.30     11320\n",
      "       fear       0.45      0.18      0.25      9658\n",
      "        joy       0.47      0.72      0.57     28342\n",
      "       love       0.27      0.08      0.12      6901\n",
      "    sadness       0.48      0.57      0.52     24103\n",
      "   surprise       0.22      0.07      0.11      3038\n",
      "\n",
      "avg / total       0.45      0.47      0.43     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "# per-class precision / recall / F1 of the reloaded booster on the test split\n",
    "print(\n",
    "    metrics.classification_report(\n",
    "        test_Y,\n",
    "        np.argmax(bst.predict(xgb.DMatrix(test_X)), axis = 1),\n",
    "        target_names = trainset_data.target_names\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
